Fork me on GitHub

java读取word excel pdf及lucene搜索之正则表达式查询RegExQuery和手机邮箱查询示例

lucene(15)—java读取word excel pdf及lucene搜索之正则表达式查询RegExQuery和手机邮箱查询示例

本文给出一个读取文档内容并进行正则查询的示例:先读取文件中的内容,再结合 lucene 内部提供的正则匹配查询 RegexpQuery,找出内容中含有手机号或者邮箱的文件。这也等于是对之前的文本处理工具的一次梳理。

废话不多说了,直接上代码。这里先对文件按类型分类处理,分为 pdf、word、excel 和普通文本四类,不同类型的文件读取内容的方式不一样。

pdf 利用 pdfbox 读取内容,word 和 excel 利用 poi 读取内容,普通文本文档利用 JDK 自带的 API 读取。

读取pdf、word、excel和普通文本文档内容(支持word excel 2007)

这里代码做了一点调整, 主要是对excel格式的空行和空列的过滤

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
package com.lucene.index.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import org.apache.pdfbox.PDFReader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
import com.lucene.bean.FileBean;
public class FileUtil {
/**读取文件信息和下属文件夹
* @param folder
* @return
* @throws IOException
* @throws OpenXML4JException
* @throws XmlException
*/
public static List<FileBean> getFolderFiles(String folder) throws Exception {
List<FileBean> fileBeans = new LinkedList<FileBean>();
File file = new File(folder);
if(file.isDirectory()){
File[] files = file.listFiles();
if(files != null){
for (File file2 : files) {
fileBeans.addAll(getFolderFiles(file2.getAbsolutePath()));
}
}
}else{
FileBean bean = new FileBean();
String filePath = file.getAbsolutePath();
bean.setPath(file.getAbsolutePath());
bean.setModified(file.lastModified());
String content = "";
if(filePath.endsWith(".doc") || filePath.endsWith(".docx")){
content = readDoc(file);
}else if(filePath.endsWith(".xls") || filePath.endsWith(".xlsx")){
content = readExcel(file);
}else if(filePath.endsWith(".pdf")){
content = readPdf(file);
}else{
content = new String(Files.readAllBytes(Paths.get(folder)));
}
bean.setContent(content);
fileBeans.add(bean);
}
return fileBeans;
}
/**讀取excel文件
* @param file
* @return
* @throws IOException
* @throws InvalidFormatException
* @throws EncryptedDocumentException
*/
public static String readExcel(File file) throws Exception {
String filePath = file.getAbsolutePath();
StringBuffer content = new StringBuffer("");
if(filePath.endsWith(".xls")){
InputStream inp = new FileInputStream(filePath);
Workbook wb = WorkbookFactory.create(inp);
Sheet sheet = wb.getSheetAt(0);
for(int i = sheet.getFirstRowNum();i<= sheet.getPhysicalNumberOfRows();i++){
HSSFRow row = (HSSFRow) sheet.getRow(i);
if (row == null) {
continue;
}
for (int j = row.getFirstCellNum(); j <= row.getLastCellNum(); j++) {
if(j < 0){
continue;//增加下标判断
}
HSSFCell cell = row.getCell(j);
if (cell == null) {
continue;
}
content.append(cell.getStringCellValue());
}
}
wb.close();
inp.close();
}else{
XSSFWorkbook xwb = new XSSFWorkbook(file.getAbsolutePath());
XSSFSheet sheet = xwb.getSheetAt(0);
// 定义 row、cell
XSSFRow row;
String cell;
// 循环输出表格中的内容
for (int i = sheet.getFirstRowNum(); i < sheet.getPhysicalNumberOfRows(); i++) {
row = sheet.getRow(i);
if(row == null){
continue;
}
for (int j = row.getFirstCellNum(); j < row.getPhysicalNumberOfCells(); j++) {
// 通过 row.getCell(j).toString() 获取单元格内容,
if(j<0){
continue;
}
XSSFCell xfcell = row.getCell(j);
if(xfcell == null){
continue;
}
xfcell.setCellType(Cell.CELL_TYPE_STRING);//数值型的转成文本型
cell = xfcell.getStringCellValue();
content.append(cell+" ");
}
}
}
return content.toString();
}
/**讀取word內容
* @param file
* @return
* @throws IOException
* @throws OpenXML4JException
* @throws XmlException
*/
public static String readDoc(File file) throws IOException, XmlException, OpenXML4JException {
String filePath = file.getAbsolutePath();
if(filePath.endsWith(".doc")){
InputStream is = new FileInputStream(file);
WordExtractor ex = new WordExtractor(is);
String text2003 = ex.getText();
ex.close();
is.close();
return text2003;
}else{
OPCPackage opcPackage = POIXMLDocument.openPackage(filePath);
POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
String text2007 = extractor.getText();
extractor.close();
return text2007;
}
}
/**讀取pdf內容
* @param file
* @return
* @throws IOException
*/
public static String readPdf(File file) throws IOException{
PDDocument doc = PDDocument.load(file.getAbsolutePath());
PDFTextStripper stripper = new PDFTextStripper();
String content = stripper.getText(doc);
doc.close();
return content;
}
}

正则查询query构建

在原有 lucene 查询的工具类的基础上加入正则查询的构建

1
2
3
4
5
6
7
8
9
10
11
/**
 * Builds a regular-expression query for a single field.
 *
 * @param field name of the field to search
 * @param regex the regular expression wrapped in the query term
 * @return a {@code RegexpQuery} over {@code field}
 */
public static Query getRegexExpQuery(String field, String regex) {
    return new RegexpQuery(new Term(field, regex));
}

最终的searchUtil的内容为

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
package com.lucene.search;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.concurrent.ExecutorService;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortField.Type;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.lucene.index.IndexUtil;
public class SearchUtil {
public static final Analyzer analyzer = new IKAnalyzer();
/**获取IndexSearcher对象(适合单索引目录查询使用)
* @param indexPath 索引目录
* @return
* @throws IOException
* @throws InterruptedException
*/
public static IndexSearcher getIndexSearcher(String indexPath,ExecutorService service,boolean realtime) throws IOException, InterruptedException{
DirectoryReader reader = DirectoryReader.open(IndexUtil.getIndexWriter(indexPath, true), realtime);
IndexSearcher searcher = new IndexSearcher(reader,service);
if(service != null){
service.shutdown();
}
return searcher;
}
/**多目录多线程查询
* @param parentPath 父级索引目录
* @param service 多线程查询
* @return
* @throws IOException
* @throws InterruptedException
*/
public static IndexSearcher getMultiSearcher(String parentPath,ExecutorService service,boolean realtime) throws IOException, InterruptedException{
MultiReader multiReader;
File file = new File(parentPath);
File[] files = file.listFiles();
IndexReader[] readers = new IndexReader[files.length];
if(!realtime){
for (int i = 0 ; i < files.length ; i ++) {
readers[i] = DirectoryReader.open(FSDirectory.open(Paths.get(files[i].getPath(), new String[0])));
}
}else{
for (int i = 0 ; i < files.length ; i ++) {
readers[i] = DirectoryReader.open(IndexUtil.getIndexWriter(files[i].getPath(), true), true);
}
}
multiReader = new MultiReader(readers);
IndexSearcher searcher = new IndexSearcher(multiReader,service);
if(service != null){
service.shutdown();
}
return searcher;
}
/**从指定配置项中查询
* @return
* @param analyzer 分词器
* @param field 字段
* @param fieldType 字段类型
* @param queryStr 查询条件
* @param range 是否区间查询
* @return
*/
public static Query getQuery(String field,String fieldType,String queryStr,boolean range){
Query q = null;
if(queryStr != null && !"".equals(queryStr)){
if(range){
String[] strs = queryStr.split("\\|");
if("int".equals(fieldType)){
int min = new Integer(strs[0]);
int max = new Integer(strs[1]);
q = NumericRangeQuery.newIntRange(field, min, max, true, true);
}else if("double".equals(fieldType)){
Double min = new Double(strs[0]);
Double max = new Double(strs[1]);
q = NumericRangeQuery.newDoubleRange(field, min, max, true, true);
}else if("float".equals(fieldType)){
Float min = new Float(strs[0]);
Float max = new Float(strs[1]);
q = NumericRangeQuery.newFloatRange(field, min, max, true, true);
}else if("long".equals(fieldType)){
Long min = new Long(strs[0]);
Long max = new Long(strs[1]);
q = NumericRangeQuery.newLongRange(field, min, max, true, true);
}
}else{
if("int".equals(fieldType)){
q = NumericRangeQuery.newIntRange(field, new Integer(queryStr), new Integer(queryStr), true, true);
}else if("double".equals(fieldType)){
q = NumericRangeQuery.newDoubleRange(field, new Double(queryStr), new Double(queryStr), true, true);
}else if("float".equals(fieldType)){
q = NumericRangeQuery.newFloatRange(field, new Float(queryStr), new Float(queryStr), true, true);
}else{
Term term = new Term(field, queryStr);
q = new TermQuery(term);
}
}
}else{
q= new MatchAllDocsQuery();
}
System.out.println(q);
return q;
}
/**多条件查询类似于sql in
* @param querys
* @return
*/
public static Query getMultiQueryLikeSqlIn(Query ... querys){
BooleanQuery query = new BooleanQuery();
for (Query subQuery : querys) {
query.add(subQuery,Occur.SHOULD);
}
return query;
}
/**获取regexQuery对象
* @param field
* @param regex
* @return
*/
public static Query getRegexExpQuery(String field,String regex){
Query query = null;
Term term = new Term(field, regex);
query = new RegexpQuery(term);
return query;
}
/**多条件查询类似于sql and
* @param querys
* @return
*/
public static Query getMultiQueryLikeSqlAnd(Query ... querys){
BooleanQuery query = new BooleanQuery();
for (Query subQuery : querys) {
query.add(subQuery,Occur.MUST);
}
return query;
}
/**对多个条件进行排序构建排序条件
* @param fields
* @param type
* @param reverses
* @return
*/
public static Sort getSortInfo(String[] fields,Type[] types,boolean[] reverses){
SortField[] sortFields = null;
int fieldLength = fields.length;
int typeLength = types.length;
int reverLength = reverses.length;
if(!(fieldLength == typeLength) || !(fieldLength == reverLength)){
return null;
}else{
sortFields = new SortField[fields.length];
for (int i = 0; i < fields.length; i++) {
sortFields[i] = new SortField(fields[i], types[i], reverses[i]);
}
}
return new Sort(sortFields);
}
/**根据查询器、查询条件、每页数、排序条件进行查询
* @param query 查询条件
* @param first 起始值
* @param max 最大值
* @param sort 排序条件
* @return
*/
public static TopDocs getScoreDocsByPerPageAndSortField(IndexSearcher searcher,Query query, int first,int max, Sort sort){
try {
if(query == null){
System.out.println(" Query is null return null ");
return null;
}
TopFieldCollector collector = null;
if(sort != null){
collector = TopFieldCollector.create(sort, first+max, false, false, false);
}else{
sort = new Sort(new SortField[]{new SortField("modified", SortField.Type.LONG)});
collector = TopFieldCollector.create(sort, first+max, false, false, false);
}
searcher.search(query, collector);
return collector.topDocs(first, max);
} catch (IOException e) {
// TODO Auto-generated catch block
}
return null;
}
/**获取上次索引的id,增量更新使用
* @return
*/
public static Integer getLastIndexBeanID(IndexReader multiReader){
Query query = new MatchAllDocsQuery();
IndexSearcher searcher = null;
searcher = new IndexSearcher(multiReader);
SortField sortField = new SortField("id", SortField.Type.INT,true);
Sort sort = new Sort(new SortField[]{sortField});
TopDocs docs = getScoreDocsByPerPageAndSortField(searcher,query, 0, 1, sort);
ScoreDoc[] scoreDocs = docs.scoreDocs;
int total = scoreDocs.length;
if(total > 0){
ScoreDoc scoreDoc = scoreDocs[0];
Document doc = null;
try {
doc = searcher.doc(scoreDoc.doc);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return new Integer(doc.get("id"));
}
return 0;
}
}

正则查询测试

正则查询测试类,主要是测试文件内容中是否包含手机号或邮箱。这里的手机号验证有点粗糙(只要是以 1 开头的 11 位数字就算匹配),希望不要介意。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
package com.lucene.index.test;
import java.io.IOException;
import java.util.concurrent.Executors;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import com.lucene.search.SearchUtil;
public class TestSearch {

    /**
     * Searches the indexes under {@code index} for documents whose {@code content} field
     * matches a phone-number or an e-mail regular expression, and prints the first 20 hits.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Keep a handle on the executor so this method controls its lifetime.
        java.util.concurrent.ExecutorService pool = Executors.newCachedThreadPool();
        IndexSearcher searcher = null;
        try {
            searcher = SearchUtil.getMultiSearcher("index", pool, false);
            Query phoneQuery = SearchUtil.getRegexExpQuery("content", "1[0-9]{10}");
            Query mailQuery = SearchUtil.getRegexExpQuery("content", "([a-z0-9A-Z]+[-_|\\.]?)+[a-z0-9A-Z]*@([a-z0-9A-Z]+(-[a-z0-9A-Z]+)?\\.)+[a-zA-Z]{2,}");
            Query finaQuery = SearchUtil.getMultiQueryLikeSqlIn(phoneQuery, mailQuery);
            TopDocs topDocs = SearchUtil.getScoreDocsByPerPageAndSortField(searcher, finaQuery, 0, 20, null);
            if (topDocs == null) {
                // The helper returns null when the search could not be executed.
                System.err.println("Search failed, nothing to display");
                return;
            }
            System.out.println("符合条件的数据总数:"+topDocs.totalHits);
            System.out.println("本次查询到的数目为:"+topDocs.scoreDocs.length);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println(doc.get("path")+" "+doc.get("content"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the interruption is not lost.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            if (searcher != null) {
                try {
                    // Release the index readers opened for this search.
                    searcher.getIndexReader().close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Harmless if the executor has already been shut down elsewhere.
            pool.shutdown();
        }
    }
}

最终测试结果如下:

1
2
3
4
5
6
7
8
9
10
11
12
content:/1[0-9]{10}/
content:/([a-z0-9A-Z]+[-_|\.]?)+[a-z0-9A-Z]*@([a-z0-9A-Z]+(-[a-z0-9A-Z]+)?\.)+[a-zA-Z]{2,}/
符合条件的数据总数:6
本次查询到的数目为:6
D:\hadoop\lucene_regexSearch\testDir\2.txt.txt 电话号码:18519237811
D:\hadoop\lucene_regexSearch\testDir\3.txt.txt 电子邮箱yinggui_Wu@163.com
D:\hadoop\lucene_regexSearch\testDir\1.docx 邮箱内容yinggui_Wu@163.com
D:\hadoop\lucene_regexSearch\testDir\1.pdf 邮箱内容 yinggui_Wu@163.com
D:\hadoop\lucene_regexSearch\testDir\1.xlsx 1 2 3 18510539956
D:\hadoop\lucene_regexSearch\testDir\1.txt.txt <a target=_blank href="mailto:fanyi@qq.com">fanyi@qq.com</a>

代码下载地址

http://download.csdn.net/detail/wuyinggui10000/8746407

坚持原创技术分享,您的支持将鼓励我继续创作!